Upon further investigation I found that DIABLO is not actually using cross validation when creating it’s ROC curves. To resolve this issue, I used mixOmics built in cross validation tool perf() to do leave one out cross validation and then rebuilt the ROC curves using the R package pROC :
library(phyloseq)
library(mixOmics)
library(ggfortify)
library(cowplot)
library(tidyverse)
library(magrittr)
library(cowplot)
setwd('~/Documents/Projects/poopsoup/poopsoup_two/final_analysis/')
log_helper <- function(x, min.val){
log2((x + sqrt(x^2 + min.val^2))/2)
}
#Pareto Scaling:
PS_helper <- function(x){
(x - mean(x))/sqrt(sd(x, na.rm = T))
}
#Transformation Functions:
#Log Scaling:
log_transform <- function(mtb){
mtb_nz <- mtb[ ,which(apply(mtb, 2, sum) != 0)]
min.val <- min(abs(mtb_nz[mtb_nz!=0]))/10
mtb_log_trans <- apply(mtb_nz, 2, log_helper, min.val)
return(mtb_log_trans)
}
#Pareto Scaling:
pareto_scale <- function(mtb){
mtb_scaled <- apply(mtb, 2, PS_helper)
return(mtb_scaled)
}
geoMean <- function(x) exp(mean(log(x)))
centerHelper <- function(x) log(x/geoMean(x))
clr <- function(x){
nz <- data.frame(apply(x, 2, function(y) replace(y, which(y == 0), 1)))
data.frame(apply(nz, 2, centerHelper))
}
asvtab <- readRDS('~/Documents/Projects/poopsoup/data/microbiome/asv_tab.RDS')
taxatab <- readRDS('~/Documents/Projects/poopsoup/data/microbiome/tax_tab.RDS')
metadata <- read.csv('~/Documents/Projects/poopsoup/data/microbiome/microbiome_metadata.csv')
metadata %<>% mutate(group = ifelse(group == 'A', 'C_Type', 'E_type'))
#Factorize and set the levels on the metadata
metadata$treatment %<>% factor(levels = c('fecal_stock', 'no_veg', 'broc', 'brus', 'combo', 'control_digest'))
metadata$fecal_sample %<>% factor(levels = c('T5631','T5632','T6260','T6291','T4669','T1995','T5627','T5717','T5854','T6382'))
rownames(metadata) <- metadata$sample
rownames(asvtab) <-metadata$sample
ps_raw <- phyloseq(otu_table(asvtab, taxa_are_rows = FALSE),
sample_data(metadata),
tax_table(taxatab))
#Give arbitrary names to the taxa as opposed to keeping as just DNA-sequences which identify them
taxa_names(ps_raw) <- paste0("ASV", seq(ntaxa(ps_raw)))
#Fill in missing genus names:
renames <- rownames(tax_table(ps_raw)[is.na(tax_table(ps_raw)[, 'Genus'])])
taxdf <- tax_table(ps_raw)[renames,]
renamed_genus <- unname(sapply(taxa_names(taxdf), function(x) paste0('f_', taxdf[x, 'Family'], '_', x)))
tax_table(ps_raw)[renames, 'Genus'] <- renamed_genus
#Remove the control digests, these are not relevant to our analysis
ps_raw <- ps_raw %>% subset_samples(treatment != 'control_digest')
#Agglomerate to the genus level
ps_genera <- ps_raw %>% tax_glom(taxrank = "Genus")
#Remove taxa not seen more than 3 times in at least 20% of the samples
ps_counts <- ps_genera %>% filter_taxa(function(x) sum(x > 3) > (0.2*length(x)), TRUE)
#Convert from counts to relative abundance
ps_relab <- ps_counts %>% transform_sample_counts(function(x) x / sum(x) )
#Filter out low abundance (>1e-5) taxa
ps <- ps_relab %>% filter_taxa(function(x) mean(x) > 1e-5, TRUE)
ps_final <- ps %>% subset_samples(treatment != 'fecal_stock')
#Create the count data
ps_final_c <- ps_counts %>%
filter_taxa(function(x) mean(x / sum(x)) > 1e-5, TRUE) %>%
subset_samples(treatment != 'fecal_stock')
microdata <- cbind(sample = data.frame(sample_data(ps_final))$metabolomics_neg_sample, data.frame(otu_table(ps_final))) %>%
mutate(sample = gsub('neg', 'ms', sample))
microdata_c <- cbind(sample = data.frame(sample_data(ps_final_c))$metabolomics_neg_sample, data.frame(otu_table(ps_final_c))) %>%
mutate(sample = gsub('neg', 'ms', sample))
## Metabolomics ------------------------------------------------------------
#Full datasets from Progenesis:
posugly <- read_csv('~/Documents/Projects/poopsoup/poopsoup_two/data/raw_progenesis/R_Friendly/pos_metabolome_full.csv')
negugly <- read_csv('../data/raw_progenesis/R_Friendly/neg_metabolome_full.csv')
#Metadata
mdata_neg <- read_csv('~/Documents/Projects//poopsoup/data/metabolomics/neg/metadata_neg.csv') %>%
mutate(group = ifelse(group == 'A', 'C_Type', 'E_type'))
mdata_pos <- read_csv('~/Documents/Projects/poopsoup/data/metabolomics/pos/metadata_pos.csv') %>%
mutate(group = ifelse(group == 'A', 'C_Type', 'E_type'))
#Remove features which have a CV greater than 50:
pos_full <- posugly %>%
filter(is.na(cv_g_50)) %>%
dplyr::select(compound, starts_with('pos')) %>%
mutate(compound = paste0(compound, '_pos')) %>%
pivot_longer(starts_with('pos'), names_to = 'sample', values_to = 'intensity') %>%
pivot_wider(names_from = 'compound', values_from = 'intensity')
neg_full <- negugly %>%
filter(is.na(cv_g_50)) %>%
dplyr::select(compound, starts_with('neg')) %>%
mutate(compound = paste0(compound, '_neg')) %>%
pivot_longer(starts_with('neg'), names_to = 'sample', values_to = 'intensity') %>%
pivot_wider(names_from = 'compound', values_from = 'intensity')
#Scale our data:
pos_scaled <- pos_full %>%
column_to_rownames('sample') %>%
log_transform() %>%
pareto_scale()%>%
as.data.frame()
neg_scaled <- neg_full %>%
column_to_rownames('sample') %>%
log_transform() %>%
pareto_scale()%>%
as.data.frame()
#Generate the metadata to be in the same order as our metabolomics data
mpos <- left_join(data.frame(sample = rownames(pos_scaled)), mdata_pos)
mneg <- left_join(data.frame(sample = rownames(neg_scaled)), mdata_neg)
#Data was rung thru MSCombine and then reuploaded here:
#Load in the data from MSCombine
mscp_raw <- read_csv('../../MScombine/mscPos.csv')
mscn_raw <- read_csv('../../MScombine/mscNeg.csv')
mscp <- mscp_raw %>%
dplyr::select(-neutral_mass, -mz, -charge, -rt) %>%
mutate(compound = paste0(compound, '_pos')) %>%
column_to_rownames('compound') %>%
t() %>%
as.data.frame() %>%
log_transform() %>%
pareto_scale()
mscn <- mscn_raw %>%
dplyr::select(-neutral_mass, -mz, -charge, -rt) %>%
mutate(compound = paste0(compound, '_neg')) %>%
column_to_rownames('compound') %>%
t() %>%
as.data.frame() %>%
log_transform() %>%
pareto_scale()
combined_set <- as.data.frame(cbind(mscn, mscp))
rownames(combined_set) <- gsub('neg', 'ms', rownames(combined_set))
#New metatdata for the combined sets
mcom <- mneg %>%
mutate(sample = gsub('neg', 'ms', sample))
# Multi-Omic Integration - Data Prep --------------------------
multiset_wide <- combined_set %>%
rename_with(~paste0('FT_', .x)) %>%
rownames_to_column('sample') %>%
left_join(microdata)
multiset_tidy <- multiset_wide %>%
pivot_longer(cols = starts_with('FT'), values_to = 'intensity', names_to = 'feature') %>%
pivot_longer(cols = starts_with('ASV'), values_to = 'relab', names_to = 'ASV') %>%
left_join(mcom)
rownames(microdata_c) <- NULL
micro_clr <- microdata_c %>%
column_to_rownames('sample') %>%
clr() %>%
rownames_to_column('sample')
multiset_wide_clr <- combined_set %>%
rename_with(~paste0('FT_', .x)) %>%
rownames_to_column('sample') %>%
left_join(micro_clr)
multiset_tidy_clr <- multiset_wide_clr %>%
pivot_longer(cols = starts_with('FT'), values_to = 'intensity', names_to = 'feature') %>%
pivot_longer(cols = starts_with('ASV'), values_to = 'relab', names_to = 'ASV') %>%
left_join(mcom)
ft_block <- multiset_wide_clr %>%
dplyr::select(starts_with('FT'))
asv_block <- multiset_wide %>%
dplyr::select(starts_with('ASV'))
asv_block_clr <- multiset_wide_clr %>%
dplyr::select(starts_with('ASV'))
#Recreate the metadata to make sure that they are in the same order as the features
multiset_meta <- multiset_wide %>%
dplyr::select('sample') %>%
left_join(mcom)
## Diablo - Multiblock PLSDA -----------------------------------------------
#Set up our model input features first:
Xdiablo <- list(micro = asv_block_clr,
metab = ft_block)
trt <- as.factor(multiset_meta$treatment)
#Adjust our data to handle repeated measures
Cov <- data.frame(subject_id = as.factor(multiset_meta$fecal_sample))
A <- lapply(Xdiablo, function(i) withinVariation(X = i, design = Cov))
#Create the design matrix:
design <- matrix(1, nrow = 3, ncol = 3)
diag(design) <- 0
keepX <- list(micro = c(15,15), metab = c(100, 100))
diablo <- block.splsda(X = A, Y = trt, keepX = keepX, ncomp = 8, design = design)
#Create the consensus space by finding the average loadings of each compononet
conDiablo <- data.frame(Reduce('+', diablo$variates)/2) %>%
cbind(trt)
pal <- RColorBrewer::brewer.pal(4, 'Dark2')
gpal <- c(RColorBrewer::brewer.pal(n = 4, name = 'Dark2'), '#00CCCC')
names(gpal) <- c('broc', 'brus', 'combo', 'no_veg', 'tie')
mc1
mc2
mt1
mt2
library(pROC)
set.seed(24)
dtestloo <- perf(diablo, validation = 'loo', auc = TRUE, progressBar = FALSE)
metab1 <- as.data.frame(dtestloo$predict$nrep1$metab$comp1)
metab2 <- as.data.frame(dtestloo$predict$nrep1$metab$comp2)
micro1 <- as.data.frame(dtestloo$predict$nrep1$micro$comp1)
micro2 <- as.data.frame(dtestloo$predict$nrep1$micro$comp2)
classmat <- data.frame(class = diablo$Y) %>%
mutate(broc = ifelse(class == 'broc', 1, 0)) %>%
mutate(brus = ifelse(class == 'brus', 1, 0)) %>%
mutate(combo = ifelse(class == 'combo', 1, 0)) %>%
mutate(no_veg = ifelse(class == 'no_veg', 1, 0)) %>%
dplyr::select(-class)
outputs <- list(metab_comp1 = metab1, metab_comp2 = metab2, micro_comp1 = micro1, micro_comp2 = micro2)
pull_roc <- function(prediction_list, classes){
map(prediction_list, function(x){
map2(classes, x, roc)
})
}
rocs <- pull_roc(outputs, classmat)
rocplots <- map(rocs, ggroc)
aucs <- map(rocs, function(x) map(x, auc))
p1 <- rocplots[[1]] +
geom_path(size = 2) +
geom_abline(intercept = 1) +
cowplot::theme_cowplot() +
scale_color_manual(values = pal,
name = 'Outcome',
labels = c(paste0('Broc vs All: ', round(aucs[[1]][[1]], 3)),
paste0('Brus vs All: ', round(aucs[[1]][[2]], 3)),
paste0('Combo vs All: ', round(aucs[[1]][[3]], 3)),
paste0('NC vs All: ', round(aucs[[1]][[4]], 3)))) +
ggtitle('Metabolome - Component 1') +
theme(legend.position = c(0.7,0.25))
p2 <- rocplots[[2]] +
geom_path(size = 2) +
geom_abline(intercept = 1) +
cowplot::theme_cowplot() +
scale_color_manual(values = pal,
name = 'Outcome',
labels = c(paste0('Broc vs All: ', round(aucs[[2]][[1]], 3)),
paste0('Brus vs All: ', round(aucs[[2]][[2]], 3)),
paste0('Combo vs All: ', round(aucs[[2]][[3]], 3)),
paste0('NC vs All: ', round(aucs[[2]][[4]], 3)))) +
ggtitle('Metabolome - Component 2') +
theme(legend.position = c(0.7,0.25))
p3 <- rocplots[[3]] +
geom_path(size = 2) +
geom_abline(intercept = 1) +
cowplot::theme_cowplot() +
scale_color_manual(values = pal,
name = 'Outcome',
labels = c(paste0('Broc vs All: ', round(aucs[[3]][[1]], 3)),
paste0('Brus vs All: ', round(aucs[[3]][[2]], 3)),
paste0('Combo vs All: ', round(aucs[[3]][[3]], 3)),
paste0('NC vs All: ', round(aucs[[3]][[4]], 3)))) +
ggtitle('Microbiome - Component 1') +
theme(legend.position = c(0.7,0.25))
p4 <- rocplots[[4]] +
geom_path(size = 2) +
geom_abline(intercept = 1) +
cowplot::theme_cowplot() +
scale_color_manual(values = pal,
name = 'Outcome',
labels = c(paste0('Broc vs All: ', round(aucs[[4]][[1]], 3)),
paste0('Brus vs All: ', round(aucs[[4]][[2]], 3)),
paste0('Combo vs All: ', round(aucs[[4]][[3]], 3)),
paste0('NC vs All: ', round(aucs[[4]][[4]], 3)))) +
ggtitle('Microbiome - Component 2') +
theme(legend.position = c(0.7,0.25))
p3
p4
p1
p2
ggplot(conDiablo, aes(x = comp1, y = comp2, color = trt)) +
geom_point(size = 3) +
stat_ellipse(aes(group = trt, fill = trt), geom = 'polygon', alpha = 0.4) +
theme(legend.position="bottom", legend.box = "horizontal",
legend.title = element_blank()) +
cowplot::theme_cowplot() +
xlab('Component 1') +
ylab('Component 2') +
ggtitle('Individual Plot') +
theme(aspect.ratio = 1) +
scale_color_manual(values = pal) +
scale_fill_manual(values = pal)
dfeatures_neg <- read_csv('~/Documents/Projects/poopsoup/poopsoup_two/diablo/diablo_out_neg.csv') %>%
dplyr::select(feature, GroupContrib, importance) %>%
rename('gc_metab' = 'GroupContrib')
dfeatures_pos <- read_csv('~/Documents/Projects/poopsoup/poopsoup_two/diablo/diablo_out_pos.csv') %>%
dplyr::select(feature, GroupContrib, importance) %>%
rename('gc_metab' = 'GroupContrib')
dfeatures <- rbind(dfeatures_neg, dfeatures_pos) %>%
arrange(desc(abs(importance)))
dasv_c1 <- read_csv('~/Documents/Projects/poopsoup/poopsoup_two/final_analysis/DiabloLoadings_comp1.csv') %>%
dplyr::select(name, GroupContrib, comp1) %>%
rename('gc_micro' = 'GroupContrib', 'ASV' = name, 'importance' = comp1)
dasv_c2 <- read_csv('~/Documents/Projects/poopsoup/poopsoup_two/final_analysis/DiabloLoadings_comp2.csv') %>%
dplyr::select(name, GroupContrib, comp2) %>%
rename('gc_micro' = 'GroupContrib', 'ASV' = name, 'importance' = comp2)
dasv <- rbind(dasv_c1, dasv_c2) %>%
unique() %>%
arrange(desc(abs(importance)))
dcir <- plotVar(diablo, style = 'ggplot2', var.names = T, plot = F) %>%
filter(names %in% c(dfeatures$feature, dasv$ASV))
circleFun <- function(center = c(-1,1),diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
circle <- circleFun(c(0,0), 2, npoints = 100)
cplot <- ggplot(dcir, aes(x, y, color = Block, shape = Block)) +
geom_point(aes(text = paste0('Feature: ', names, '\n')), size = 3) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
cowplot::theme_cowplot() +
theme(aspect.ratio = 1) +
xlab('Component 1') +
ylab('Component 2') +
ggtitle('Correlation Circle') +
geom_path(aes(x,y), data = circle, inherit.aes = F)
plotly::ggplotly(cplot, tooltip = 'text')
Spearman’s correlation was calculated between each metablomic feature of interest without respect to treatment. The results were then filtered to the features which had an importance greater than 0.1 in DIABLO. This resulted in 64 metabolomic features and 27 ASVs.
A heatmap was made between ASVs and metabolomic features. The fill of each cell represents the strength of the correlation as calculated by spearman’s rho.
cortable <- readRDS('~/Documents/Projects/poopsoup/poopsoup_two/final_analysis/diablocortable.RDS')
dcor <- cortable %>%
mutate(cor = map_dbl(spm, function(x) x$estimate))
cmt <- ggplot(dcor, aes(x = ASV, y = feature, fill = cor)) +
geom_raster(aes(text = paste0('Feature: ', feature, '\n',
'ASV: ', ASV, '\n',
'Rho: ', round(cor, 3)))) +
scale_fill_gradient(low = 'red', high = 'green') +
theme(axis.text.x = element_text(angle = 90)) +
xlab('ASV') +
ylab('Metabolomic Feature') +
ggtitle('Correlation Matrix')
plotly::ggplotly(cmt, tooltip = 'text')
To investigate the relationship between metabolomic features and ASVs we can use the most important DIABLO features of one ome, find the features in the other ome which have the strongest spearman’s correlation and plot them in a scatterplot.
First we will extract our 5 most important DIALBO features for both the metabolome and the microbiome.
topft <- dfeatures$feature[1:5]
topasv <- dasv$ASV[1:5]
head(dfeatures, n = 5)
## # A tibble: 5 × 3
## feature gc_metab importance
## <chr> <chr> <dbl>
## 1 FT_24.36_538.3077_pos combo 0.316
## 2 FT_22.73_384.2347_pos combo 0.287
## 3 FT_24.32_440.0964_pos no_veg 0.273
## 4 FT_24.75_409.2405_pos combo 0.246
## 5 FT_20.29_187.0974_neg combo -0.239
head(dasv, n = 5)
## # A tibble: 5 × 3
## ASV gc_micro importance
## <chr> <chr> <dbl>
## 1 ASV487 brus -0.706
## 2 ASV277 no_veg 0.508
## 3 ASV241 combo -0.459
## 4 ASV461 combo -0.446
## 5 ASV427 combo 0.370
Using the feature importance from DIABLO we find that the top 5 most important metabolomic features are FT_24.36_538.3077_pos, FT_22.73_384.2347_pos, FT_24.32_440.0964_pos, FT_24.75_409.2405_pos, FT_20.29_187.0974_neg. The 5 most important ASVs are ASV487, ASV277, ASV241, ASV461, and ASV427.
Using these features as nodes, we can plot the edges (as represented by spearman’s rho) between it and all other features in the other ome.
ugh <- dcor %>%
filter(feature %in% topft) %>%
dplyr::select(-data, -spm) %>%
pivot_wider(names_from = 'ASV', values_from = 'cor')
gpal <- c(RColorBrewer::brewer.pal(n = 4, name = 'Dark2'), '#00CCCC')
names(gpal) <- c('broc', 'brus', 'combo', 'no_veg', 'tie')
for(i in 1:nrow(ugh)){
tmp <- ugh[i,] %>%
pivot_longer(cols = starts_with('ASV'), names_to = 'ASV', values_to = 'cor') %>%
left_join(dfeatures) %>%
left_join(dasv, by = 'ASV')
p <- ggplot(tmp, aes(x = ASV, y = cor)) +
geom_point(aes(color = gc_micro), size = 3) +
ggtitle(paste0(tmp$feature[1], ' - Group Contribution: ', tmp$gc_metab[1])) +
scale_color_manual(values = gpal,
name = 'Microbiome Contribution') +
xlab('Feature') +
ylab('Spearmans Rho') +
theme(axis.text.x = element_text(angle = 90))
plot(p)
}
ugh2 <- dcor %>%
filter(ASV %in% topasv) %>%
dplyr::select(-data, -spm) %>%
pivot_wider(names_from = 'feature', values_from = 'cor')
for(i in 1:nrow(ugh2)){
tmp <- ugh2[i,] %>%
pivot_longer(cols = starts_with('FT'), names_to = 'feature', values_to = 'cor') %>%
left_join(dfeatures) %>%
left_join(dasv, by = 'ASV')
p <- ggplot(tmp, aes(x = feature, y = cor)) +
geom_point(aes(color = gc_metab), size = 3) +
ggtitle(paste0(tmp$ASV[1], ' - Group Contribution: ', tmp$gc_micro[1])) +
scale_color_manual(values = gpal,
name = 'Metabolome Group Contribution') +
xlab('Feature') +
ylab('Spearmans Rho') +
theme(axis.text.x = element_text(angle = 90))
plot(p)
}
Using the nodes selected from DIABLO, we will now plot pairwise scatterplots for the 5 edges which have the highest spearman’s rho:
mtiny <- multiset_tidy_clr %>%
filter(feature %in% dfeatures$feature) %>%
filter(ASV %in% dasv$ASV) %>%
dplyr::select(feature, intensity, ASV, relab, treatment)
asvedges <- dcor %>%
filter(ASV %in% topasv) %>%
dplyr::select(-data, -spm)
for(asv in topasv){
tedge <- asvedges %>%
filter(ASV == asv) %>%
arrange(desc(abs(cor))) %>%
use_series(feature) %>%
extract(1:5)
for(ft in tedge){
tsp <- mtiny %>%
filter(ASV == asv) %>%
filter(feature == ft) %>%
left_join(dfeatures) %>%
left_join(dasv, by = 'ASV')
p <- ggplot(tsp, aes(x = relab, y = intensity, color = treatment)) +
geom_point() +
xlab(paste0(asv, ' CLR Transformed Abundance')) +
ylab(paste0(ft, ' Relative Intensity')) +
ggtitle(paste0('Metabolomic Feature Group: ', tsp$gc_metab[1], '\n',
'Microbiome Group: ', tsp$gc_micro[1], '\n',
'Correlation: ', round(cor(tsp$relab, tsp$intensity, method = 'spearman'), 3))) +
scale_color_manual(values = gpal[1:4])
plot(p)
}
}
ftedges <- dcor %>%
filter(feature %in% topft)
for(ft in topft){
tedge <- ftedges %>%
filter(feature == ft) %>%
arrange(desc(abs(cor))) %>%
use_series(ASV) %>%
extract(1:5)
for(asv in tedge){
tsp <- mtiny %>%
filter(ASV == asv) %>%
filter(feature == ft) %>%
left_join(dfeatures) %>%
left_join(dasv, by = 'ASV')
p <- ggplot(tsp, aes(x = relab, y = intensity, color = treatment)) +
geom_point() +
xlab(paste0(asv, ' CLR Transformed Abundance')) +
ylab(paste0(ft, ' Relative Intensity')) +
ggtitle(paste0('Metabolomic Feature Group: ', tsp$gc_metab[1], '\n',
'Microbiome Group: ', tsp$gc_micro[1], '\n',
'Correlation: ', round(cor(tsp$relab, tsp$intensity, method = 'spearman'), 3))) +
scale_color_manual(values = gpal[1:4])
plot(p)
}
}